# Unpack the knit/run parameters into individual flags so the rest of the
# script can reference them by short name.
output.var    <- params$output.var
transform.abs <- params$transform.abs
log.pred      <- params$log.pred
eda           <- params$eda
algo.forward  <- params$algo.forward
algo.backward <- params$algo.backward
algo.stepwise <- params$algo.stepwise
algo.LASSO    <- params$algo.LASSO
algo.LARS     <- params$algo.LARS
# Echo the full parameter list for reproducibility of this run.
message("Parameters used for training/prediction: ")
## Parameters used for training/prediction:
str(params)
## List of 9
##  $ output.var   : chr "y3"
##  $ transform.abs: logi FALSE
##  $ log.pred     : logi FALSE
##  $ eda          : logi FALSE
##  $ algo.forward : logi FALSE
##  $ algo.backward: logi FALSE
##  $ algo.stepwise: logi FALSE
##  $ algo.LASSO   : logi TRUE
##  $ algo.LARS    : logi FALSE
# Setup Labels
# alt.scale.label.name = Alternate Scale variable name
# - if predicting on log, then alt.scale is normal scale
# - if predicting on normal scale, then alt.scale is log scale
#
# Use a single if/else instead of two disjoint `if (x == TRUE)` /
# `if (x == FALSE)` tests: the original pair silently assigned nothing
# if `log.pred` were NA, and `== TRUE` is redundant on a logical flag.
if (log.pred) {
  label.names <- paste0("log.", output.var)
  alt.scale.label.name <- output.var
} else {
  label.names <- output.var
  alt.scale.label.name <- paste0("log.", output.var)
}
# Load model inputs; both files share the JobName key (one row per job).
feat = read.csv('../../Data/features.csv')
labels = read.csv('../../Data/labels.csv')
# Predictor names = every feature column except the JobName join key.
predictors = names(dplyr::select(feat,-JobName))
target = 'y3'
# Join features with only the chosen target column, keyed on JobName.
data.ori = inner_join(feat,select_at(labels,c('JobName',target)),by='JobName')
# Split rows containing any NA from fully observed rows; only complete
# cases are used downstream.
cc = complete.cases(data.ori)
data.notComplete = data.ori[! cc,]
data = data.ori[cc,]
message('Non-Complete cases: ',nrow(data.notComplete))
## Non-Complete cases: 2497
message('Complete cases: ',nrow(data))
## Complete cases: 7503
The target variable y3 shows right skewness, so we suggest a log transformation (see the Feature Engineering section).
# Distribution of the target: histogram with a density overlay.
# `..density..` is deprecated since ggplot2 3.4.0; use after_stat(density).
ggplot(gather(select_at(data,target)), aes(value)) +
  geom_histogram(aes(y = after_stat(density)), bins = 50, fill = 'light blue') +
  geom_density() +
  facet_wrap(~key, scales = 'free', ncol = 4)
# Normal Q-Q plot of the target to visualize the skew.
ggplot(gather(select_at(data,target)), aes(sample = value)) +
  stat_qq() +
  facet_wrap(~key, scales = 'free', ncol = 4)
Normalization of y3 using the bestNormalize package (suggested: orderNorm). This is cool, but I think it goes too far for the objective of the project.
# Search for the best normalizing transform of y3 (out-of-sample CV).
# Renamed from `t`, which shadows base::t() (matrix transpose).
bn_y3 <- bestNormalize::bestNormalize(data$y3)
bn_y3
## Best Normalizing transformation with 7503 Observations
##  Estimated Normality Statistics (Pearson P / df, lower => more normal):
##  - No transform: 3.0956
##  - Box-Cox: 1.5118
##  - Log_b(x+a): 2.1069
##  - sqrt(x+a): 2.5466
##  - exp(x): 803.3388
##  - arcsinh(x): 2.1069
##  - Yeo-Johnson: 1.2209
##  - orderNorm: 1.2101
## Estimation method: Out-of-sample via CV with 10 folds and 5 repeats
##
## Based off these, bestNormalize chose:
## orderNorm Transformation with 7503 nonmissing obs and no ties
##  - Original quantiles:
##      0%     25%     50%     75%    100%
##  95.913 118.211 123.989 131.055 193.726
# Q-Q plots: raw y3 vs the selected (orderNorm) transform.
qqnorm(data$y3)
qqnorm(predict(bn_y3))
orderNorm() is a rank-based procedure by which the values of a vector are mapped to their percentile, which is then mapped to the same percentile of the normal distribution. Without the presence of ties, this essentially guarantees that the transformation leads to a uniform distribution
All predictors show a fat-tail situation, where the two tails are very tall with a low density around the mean. The orderNorm transformation can help (see [Best Normalizator] section).
# Two representative predictors: x11 (tiny numeric scale) and x18.
cols <- c('x11','x18')
# Histogram + density; `..density..` is deprecated since ggplot2 3.4.0,
# use after_stat(density) instead.
ggplot(gather(select_at(data,cols)), aes(value)) +
  geom_histogram(aes(y = after_stat(density)), bins = 50, fill = 'light blue') +
  geom_density() +
  facet_wrap(~key, scales = 'free', ncol = 4)
# Normal Q-Q plots for the same two predictors.
ggplot(gather(select_at(data,cols)), aes(sample = value)) +
  stat_qq() +
  facet_wrap(~key, scales = 'free', ncol = 4)
# Five-number summaries; note x11's range is ~1e-7, x18's is ~1.5-8.
lapply(select_at(data,cols),summary)
## $x11
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max.
## 9.000e-08 9.500e-08 1.000e-07 1.001e-07 1.050e-07 1.100e-07
##
## $x18
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
##   1.500   3.128   4.769   4.769   6.415   7.999
Normalization of x18 using the bestNormalize package (suggested: orderNorm). This is cool, but I think it goes too far for the objective of the project.
# Search for the best normalizing transform of x18 (out-of-sample CV).
# Renamed from `t`, which shadows base::t() (matrix transpose).
bn_x18 <- bestNormalize::bestNormalize(data$x18)
## Warning in orderNorm(standardize = TRUE, warn = TRUE, x = c(4.76747513, : Ties in data, Normal distribution not guaranteed
bn_x18
## Best Normalizing transformation with 7503 Observations
##  Estimated Normality Statistics (Pearson P / df, lower => more normal):
##  - No transform: 8.2731
##  - Box-Cox: 7.932
##  - Log_b(x+a): 10.1169
##  - sqrt(x+a): 8.1572
##  - exp(x): 120.988
##  - arcsinh(x): 9.677
##  - Yeo-Johnson: 8.1007
##  - orderNorm: 1.0933
## Estimation method: Out-of-sample via CV with 10 folds and 5 repeats
##
## Based off these, bestNormalize chose:
## orderNorm Transformation with 7503 nonmissing obs and ties
##  - 7500 unique values
##  - Original quantiles:
##    0%   25%   50%   75%  100%
## 1.500 3.128 4.769 6.415 7.999
# Q-Q plots: raw x18 vs the selected (orderNorm) transform.
newx18 <- predict(bn_x18)
qqnorm(data$x18)
qqnorm(newx18)
orderNorm() is a rank-based procedure by which the values of a vector are mapped to their percentile, which is then mapped to the same percentile of the normal distribution. Without the presence of ties, this essentially guarantees that the transformation leads to a uniform distribution
Normalization of x11 using the bestNormalize package (suggested: orderNorm). This is cool, but I think it goes too far for the objective of the project.
# Search for the best normalizing transform of x11 (out-of-sample CV).
# Renamed from `t`, which shadows base::t() (matrix transpose).
bn_x11 <- bestNormalize::bestNormalize(data$x11)
## Warning in orderNorm(standardize = TRUE, warn = TRUE, x = c(1.05e-07, 1.03e-07, : Ties in data, Normal distribution not guaranteed
bn_x11
## Best Normalizing transformation with 7503 Observations
##  Estimated Normality Statistics (Pearson P / df, lower => more normal):
##  - No transform: 13.5364
##  - Box-Cox: 13.4991
##  - Log_b(x+a): 13.5364
##  - sqrt(x+a): 13.5364
##  - exp(x): 13.5364
##  - arcsinh(x): 13.5364
##  - Yeo-Johnson: 13.5364
##  - orderNorm: 7.0273
## Estimation method: Out-of-sample via CV with 10 folds and 5 repeats
##
## Based off these, bestNormalize chose:
## orderNorm Transformation with 7503 nonmissing obs and ties
##  - 111 unique values
##  - Original quantiles:
##   0%  25%  50%  75% 100%
##    0    0    0    0    0
# Q-Q plots: raw x11 vs the selected (orderNorm) transform.
qqnorm(data$x11)
qqnorm( predict(bn_x11))
orderNorm() is a rank-based procedure by which the values of a vector are mapped to their percentile, which is then mapped to the same percentile of the normal distribution. Without the presence of ties, this essentially guarantees that the transformation leads to a uniform distribution
All predictors show strong evidence of fat tails.
# Distribution of every predictor: histogram + density, one facet each.
# `..density..` is deprecated since ggplot2 3.4.0; use after_stat(density).
ggplot(gather(select_at(data,predictors)), aes(value)) +
  geom_histogram(aes(y = after_stat(density)), bins = 50, fill = 'light blue') +
  geom_density() +
  facet_wrap(~key, scales = 'free', ncol = 4)
# Normal Q-Q plot per predictor.
ggplot(gather(select_at(data,predictors)), aes(sample = value)) +
  stat_qq() +
  facet_wrap(~key, scales = 'free', ncol = 4)
#chart.Correlation(select(data,-JobName), pch=21)
# Correlation of each predictor with the target, rounded to 4 dp.
# tidyselect one_of() is superseded: all_of() (errors on missing names)
# is the recommended replacement. Also avoid `t`, which shadows base::t().
cor.with.target <- round(cor(dplyr::select(data, -all_of(c(target, 'JobName'))), select_at(data, target)), 4)
DT::datatable(cor.with.target)
#chart.Correlation(select(data,-JobName), pch=21)
# Full predictor-predictor correlation matrix (JobName key excluded).
cor.full <- round(cor(dplyr::select(data, -all_of('JobName'))), 4)
DT::datatable(cor.full, options = list(scrollX = TRUE))
Scatter plots with all predictors and the target variable (y3)
# Long format: one row per (predictor, value) pair; y3 is kept wide as
# the common y-axis. `key=target` names the new key column "target" (NSE,
# not the value of the `target` variable).
# NOTE(review): `-y3` hard-codes the target column; tying it to `target`
# ('y3') would be safer if the parameterized target ever changes.
d = gather(dplyr::select_at(data,c(predictors,target)),key=target,value=value,-y3)
# Scatter of every predictor against y3 with a GAM smooth per facet.
ggplot(data=d, aes(x=value,y=y3)) +
geom_point(color='light blue',alpha=0.5) +
geom_smooth() +
facet_wrap(~target, scales = 'free',ncol=4)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
No multicollinearity among predictors.
Showing the top 10 predictors by VIF value.
# Variance Inflation Factor for every predictor, sorted highest first.
# VIF ~ 1 throughout (max ~1.07) => no meaningful multicollinearity.
vifDF = usdm::vif(select_at(data,predictors)) %>% arrange(desc(VIF))
head(vifDF,10)
##    Variables      VIF
## 1    stat147 1.067325
## 2    stat113 1.061632
## 3    stat104 1.059886
## 4     stat31 1.059750
## 5    stat213 1.059603
## 6     stat20 1.059478
## 7    stat178 1.059423
## 8    stat138 1.058587
## 9    stat120 1.058544
## 10     stat2 1.058194
No transformation for x18.
Log transformation for y3.
# Candidate transforms added as new columns: sqrt(x18) and log(y3).
df <- data %>%
  mutate(
    x18sqrt = sqrt(x18),
    y3log   = log(y3)
  )
# From here on the modelling target is the log-scale variable.
target <- 'y3log'
# Columns to compare pre- vs post-transformation.
cols <- c('y3', 'y3log', 'x18', 'x18sqrt')
Pre- and post-transformation.
# Histogram + density for the raw and transformed columns side by side.
# `..density..` is deprecated since ggplot2 3.4.0; use after_stat(density).
ggplot(gather(select_at(df,cols)), aes(value)) +
  geom_histogram(aes(y = after_stat(density)), bins = 50, fill = 'light blue') +
  geom_density() +
  facet_wrap(~key, scales = 'free', ncol = 4)
Vs y3log
# Drop the raw-scale target; plot the remaining columns against y3log.
cols2 <- setdiff(cols, 'y3')
d <- gather(dplyr::select_at(df, cols2), key = target, value = value, -y3log)
# Scatter + GAM smooth, one facet per column.
ggplot(data = d, aes(x = value, y = y3log)) +
  geom_point(color = 'light blue', alpha = 0.5) +
  geom_smooth() +
  facet_wrap(~target, scales = 'free', ncol = 4)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
# Remove columns that will not be used for modelling: the rejected
# sqrt transform and the raw-scale target.
df <- df %>%
  dplyr::select(-x18sqrt, -y3)
Scatter plots with all predictors and the transformed target variable (y3log).
# Long format over all predictors, keeping y3log wide as the y-axis.
# `key=target` names the key column "target" (NSE), and `-y3log`
# hard-codes the transformed target column.
d = gather(dplyr::select_at(df,c(predictors,target)),key=target,value=value,-y3log)
# Scatter of every predictor against y3log with a GAM smooth per facet.
ggplot(data=d, aes(x=value,y=y3log)) +
geom_point(color='light green',alpha=0.5) +
geom_smooth() +
facet_wrap(~target, scales = 'free',ncol=4)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
The target variable y3 can be log-transformed.
The predictor x18 does not improve with the sqrt transformation.
All predictors could benefit from an orderNorm transformation.